# Core numerics / data handling
import pandas as pd
import numpy as np
import matplotlib as mpl
# scikit-learn: standardisation, PCA and K-means clustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# SciPy hierarchical clustering (linkage + dendrogram helpers)
from scipy.cluster import hierarchy
# Visualisation libraries
## Text (coloured terminal text, rich notebook display)
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("white")
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
# NOTE(review): the 'seaborn-white' style sheet was renamed to
# 'seaborn-v0_8-white' in matplotlib 3.6, so this line raises OSError on
# newer matplotlib -- confirm the pinned matplotlib version.
plt.style.use('seaborn-white')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# Render figures inline in the notebook (IPython magic)
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format (IPython magic)
%config InlineBackend.figure_format = 'retina'
import warnings
# Silence all warnings for cleaner notebook output
warnings.filterwarnings("ignore")
When fitting a predictive model with a supervised learning technique, we can check our work by seeing how well the model predicts the response $Y$ on observations not used in fitting. In contrast, this is not possible with unsupervised learning: there is no way to check our work, because we do not know the true answer—the problem is unsupervised.
In dealing with a large set of correlated variables, principal components can be handy by means of summarizing this set with a smaller number of representative variables that collectively explain most of the variability in the original set.
Suppose that we wish to visualize n observations with measurements on a set of p features, $X_1$, $X_2$, $\ldots$ , $X_p$, as part of an exploratory data analysis.
Letting $z_{i1} = \sum_{j=1}^{p} \phi_{j1} x_{ij}$, we can express this problem as follows,
$$\max_{\phi_{11}, \ldots, \phi_{p1}}\left\{ \frac{1}{n} \sum_{i=1}^{n} z_{i1}^2\right\}, \quad \mbox{subject to } \sum_{j=1}^{p} \phi_{j1}^2 = 1.$$Note that $\frac{1}{n}\sum_{i=1}^{n} z_{i1} = 0$, since $\frac{1}{n}\sum_{i=1}^{n} x_{ij} = 0$ for every $j$ (the variables are centered). Moreover, the values $z_{11}, \ldots, z_{n1}$ are referred to as the scores of the first principal component.
where $\phi_{2} =\left[ \phi_{12}, \phi_{22},\ldots ,\phi_{p2}\right]$ is the second principal component loading vector.
This dataset can be extracted from the ISLR package using the following syntax.
# R snippet: export the USArrests data frame bundled with the ISLR package
# to CSV so it can be loaded from Python below.
library (ISLR)
write.csv(USArrests, "USArrests.csv")
# Load the exported USArrests data; the first CSV column holds the state
# names and becomes the index.
USArrests = pd.read_csv('Data/USArrests.csv', index_col=0)
display(USArrests.head(10))
We can also examine the variances of the four variables
# Per-variable (column) sample variances, colour-shaded by magnitude.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed
# in 2.0 -- this cell assumes an older pandas; confirm the pinned version.
display(USArrests.var().to_frame('Variance').style.background_gradient(cmap='Reds', subset=['Variance']).set_precision(2))
We can scale the variables to have standard deviation one. This can be done using StandardScaler.
# Standardise every variable to mean 0 / standard deviation 1 before PCA,
# keeping the state names and column labels on the result.
scaler = StandardScaler()
standardised_values = scaler.fit_transform(USArrests)
df = pd.DataFrame(standardised_values,
                  index=USArrests.index,
                  columns=USArrests.columns)
Now,
# After scaling, every variable has (sample) variance close to 1.
display(df.var().to_frame('Variance').style.background_gradient(cmap='Greens', subset=['Variance']).set_precision(2))
# The loading vectors
# The loading vectors of the four principal components.
# components_ has shape (n_components, n_features); transpose it so each
# row is an original variable and each column V1..V4 is a component.
fitted_pca = PCA().fit(df)
pca_loadings = pd.DataFrame(fitted_pca.components_.T,
                            index=df.columns,
                            columns=['V1', 'V2', 'V3', 'V4'])
display(pca_loadings)
# Fit the PCA model and transform X to get the principal components
# Fit the PCA model and project the scaled data onto the components,
# giving one score column per principal component.
pca = PCA()
scores = pca.fit_transform(df)
df_plot = pd.DataFrame(scores,
                       columns=['PC1', 'PC2', 'PC3', 'PC4'],
                       index=df.index)
df_plot.head(10)
# Biplot: PC scores on the primary axes, loading vectors on twin axes.
fig , ax = plt.subplots(figsize=(12,12))
_ = ax.set_xlim(-3.5,3.5)
_ = ax.set_ylim(-3.5,3.5)
# Plot Principal Components 1 and 2 as state-name labels.
# PC2 is negated -- presumably to match the orientation of the textbook
# figure (PCA signs are arbitrary); confirm against the reference plot.
for i in df_plot.index:
    ax.annotate(i, (df_plot.PC1.loc[i], -df_plot.PC2.loc[i]), ha='center')
# Plot dotted reference lines through the origin
_ = ax.hlines(0,-3.5,3.5, lw = 1.5, linestyles='dotted', colors='DarkGreen')
_ = ax.vlines(0,-3.5,3.5, lw = 1.5, linestyles='dotted', colors='DarkGreen')
_ = ax.set_xlabel('First Principal Component')
_ = ax.set_ylabel('Second Principal Component')
# Creating a second pair of axes (twin x and y) for the loading-vector plot,
# since the loadings live on the [-1, 1] scale rather than the score scale
ax1 = ax.twinx().twiny()
_ = ax1.set_ylim(-1,1)
_ = ax1.set_xlim(-1,1)
_ = ax1.tick_params(axis='y', colors='orange')
_ = ax1.set_xlabel('Principal Component loading vectors', color='OrangeRed')
# Offset factor so the variable labels sit just beyond the arrow tips
a = 1.07
for i in pca_loadings[['V1', 'V2']].index:
    _ = ax1.annotate(i, (pca_loadings.V1.loc[i]*a, -pca_loadings.V2.loc[i]*a), color='OrangeRed')
# Plot vectors: one arrow per original variable (V2 negated as above)
_ = ax1.arrow(0,0, pca_loadings.V1[0], -pca_loadings.V2[0], head_width=0.02, head_length=0.05, lw = 1.5, fc='Blue', ec='k')
_ = ax1.arrow(0,0, pca_loadings.V1[1], -pca_loadings.V2[1], head_width=0.02, head_length=0.05, lw = 1.5, fc='Blue', ec='k')
_ = ax1.arrow(0,0, pca_loadings.V1[2], -pca_loadings.V2[2], head_width=0.02, head_length=0.05, lw = 1.5, fc='Blue', ec='k')
_ = ax1.arrow(0,0, pca_loadings.V1[3], -pca_loadings.V2[3], head_width=0.02, head_length=0.05, lw = 1.5, fc='Blue', ec='k')
We can obtain a summary of the proportion of variance explained (PVE) of the first few principal components.
# Summary table of the PCA fit: one row per statistic, one column per PC.
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent, works on all pandas versions, and
# produces the identical table in a single pass.
Results = pd.concat([
    # Standard deviation of each principal component
    pd.DataFrame(np.sqrt(pca.explained_variance_),
                 index = df_plot.columns, columns = ['Standard Deviation']).T,
    # The variance explained by each principal component
    pd.DataFrame(pca.explained_variance_,
                 index = df_plot.columns, columns = ['Explained Variance']).T,
    # Explained variance ratio of each principal component
    pd.DataFrame(pca.explained_variance_ratio_,
                 index = df_plot.columns, columns = ['Proportion of Variance']).T,
    # Cumulative proportion of variance explained
    pd.DataFrame(np.cumsum(pca.explained_variance_ratio_),
                 index = df_plot.columns, columns = ['Cumulative Proportion']).T,
])
# NOTE(review): Styler.set_precision is also removed in pandas 2.0
# (use .format(precision=4) there); kept as-is for the pinned environment.
display(Results.style.set_precision(4))
We see that the first principal component explains 62.0 % of the variance in the data, the next principal component explains 24.7 % of the variance, and so forth. We can plot the proportion of variance explained (PVE) explained by each component, as well as the cumulative PVE, as follows:
# Scree plot: individual and cumulative PVE for the USArrests components.
fig = go.Figure()
# Individual proportion of variance explained per component
fig.add_trace(go.Scatter(x= np.arange(1, len(pca.explained_variance_ratio_)+1),
                         y= pca.explained_variance_ratio_, line=dict(color='OrangeRed', width= 1.5),
                         name = 'Individual Component'))
# numpy.cumsum: Return the cumulative sum of the elements along a given axis.
fig.add_trace(go.Scatter(x= np.arange(1, len(pca.explained_variance_ratio_)+1),
                         y= np.cumsum(pca.explained_variance_ratio_), line=dict(color='purple', width= 1.5),
                         name = 'Cumulative'))
# Legend
fig.update_layout(legend=dict(traceorder="normal", bordercolor="DarkGray", borderwidth=1))
# Update: small padding so the end points are not clipped by the axis range
delta = .01
fig.update_layout(dragmode='select', plot_bgcolor= 'white', height=600, width=720, hovermode='closest')
fig.update_xaxes(title_text='Principal Component', range=[1-delta, 4+delta],)
fig.update_yaxes(title_text='Proportion of Variance Explained', range=[0-delta, 1+delta])
# Background: white canvas with light grey grid and mirrored axis lines
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
We begin with a simple simulated example in which there truly are two clusters in the data: the first 25 observations have a mean shift relative to the next 25 observations.
# Simulate 50 points in 2-D and shift the first 25 by (+3, -4) so the
# sample contains two genuinely separated clusters.
np.random.seed(2)
X = np.random.standard_normal((50, 2))
X[:25] += np.array([3.0, -4.0])
We now perform K-means clustering with K = 2.
# K-means with two clusters, using 20 random restarts (n_init) and keeping
# the best solution. fit() returns the estimator itself, so the chained
# call leaves km1 bound to the fitted model exactly as before.
km1 = KMeans(n_clusters=2, n_init=20).fit(X)
# Cluster sizes, in label order
pd.Series(km1.labels_).value_counts(sort=False).to_frame('Count').T
We can also perform K-means clustering on this example with K = 3.
# Same exercise with three clusters; reseed first so the restarts are
# reproducible.
np.random.seed(4)
km2 = KMeans(n_clusters=3, n_init=20).fit(X)
pd.Series(km2.labels_).value_counts(sort=False).to_frame('Count').T
def Plot_KNN(cls, X, h=0.02, pad=0.25, ax = False, fs = 7,
             ColorMap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['OrangeRed', 'Lime', 'RoyalBlue'])):
    """Plot a fitted clustering model's decision regions over 2-D data.

    Despite the name, this works for any fitted estimator exposing
    ``labels_``, ``predict`` and ``cluster_centers_`` (it is used with
    KMeans in this notebook).

    Parameters
    ----------
    cls : fitted clustering estimator.
    X : array of shape (n_samples, 2) -- the data that was clustered.
    h : float, mesh step size for the background decision surface.
    pad : float, margin added around the data range.
    ax : matplotlib Axes, or False to create a new square figure.
    fs : float, figure size in inches (used only when ax is False).
    ColorMap : colormap shared by the decision surface and the points.
    """
    y = cls.labels_
    # Data range plus a margin on every side
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad
    # Mesh covering the plotting area
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predicted cluster for every mesh point, reshaped back onto the grid
    Pred = cls.predict(np.c_[xx.ravel(), yy.ravel()])
    Pred = Pred.reshape(xx.shape)
    # Create a figure only when the caller did not supply an Axes.
    # Fix: `is False`, not `== False` -- an Axes object must not be
    # compared for equality against a bool.
    if ax is False:
        fig, ax = plt.subplots(1, 1, figsize=(fs, fs))
    _ = ax.contourf(xx, yy, Pred, cmap=ColorMap, alpha=0.2)
    _ = ax.scatter(X[:, 0], X[:, 1], s=70, c=y, cmap=ColorMap)
    # Cluster centers drawn as yellow stars.
    # Fix: linewidths takes a number, not the string '1'.
    cc = cls.cluster_centers_
    _ = ax.scatter(cc[:, 0], cc[:, 1], marker=(5, 1), facecolors='Yellow',
                   edgecolor='Black', s=150, linewidths=1,
                   label='Cluster Centers')
    _ = ax.set_xlim(x_min, x_max)
    _ = ax.set_ylim(y_min, y_max)
    _ = ax.set_xlabel(r'$X_1$')
    _ = ax.set_ylabel(r'$X_2$')
    _ = ax.legend(loc='lower left', fontsize=14)
# Side-by-side decision-region plots for the two K-means fits.
# Fix: the 'seaborn-white' style sheet was renamed to 'seaborn-v0_8-white'
# in matplotlib 3.6; try the old name first so this cell works on both old
# and new matplotlib instead of raising on the newer releases.
try:
    plt.style.use('seaborn-white')
except (OSError, ValueError):
    plt.style.use('seaborn-v0_8-white')
fig, ax = plt.subplots(1, 2, figsize=(15, 6.8))
# Left panel: K = 2
_ = Plot_KNN(km1, X= X, ax = ax[0])
_ = ax[0].set_title(r'K-means clustering with $K = 2$')
# Right panel: K = 3
_ = Plot_KNN(km2, X= X, ax = ax[1])
_ = ax[1].set_title(r'K-means clustering with $K = 3$')
In this section we use scipy.cluster.hierarchy.dendrogram to plot the hierarchical clustering as a dendrogram.
For this data, complete and average linkage generally separate the observations into their correct groups. However, single linkage identifies one point as belonging to its own cluster. A more sensible answer is obtained when four clusters are selected, although there are still two singletons.
# One dendrogram per linkage method, stacked vertically.
fig, axes = plt.subplots(3, 1, figsize=(15, 18))
linkage_matrices = [hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)]
titles = ['Complete Linkage', 'Average Linkage', 'Single Linkage']
for Z, ax, title in zip(linkage_matrices, axes, titles):
    hierarchy.dendrogram(Z, ax=ax, color_threshold=0, show_contracted=True)
    _ = ax.set_title(title)
In this section, we work on NCI60 cancer cell line microarray data, which consists of 6,830 gene expression measurements on 64 cancer cell lines.
This dataset can be extracted from the ISLR package using the following syntax.
# R snippet: export the NCI60 dataset from the ISLR package to CSV so it
# can be loaded from Python below.
library (ISLR)
write.csv(NCI60, "NCI60.csv")
# Load NCI60: 6,830 gene-expression columns plus a 'labs' column holding
# the cancer type of each of the 64 cell lines.
df = pd.read_csv('Data/NCI60.csv', index_col=0)
display(df.head())
# Scaling: standardise every gene to mean 0 / sd 1 (labels excluded)
scaler = StandardScaler()
X = scaler.fit_transform(df.drop(columns = 'labs').values)
y = df['labs'].values
# Number of cell lines per cancer type
display(pd.Series(y).value_counts(sort = False).to_frame('Count').T)
# Fit the PCA model and transform X to get the principal components
# Fit the PCA model and transform X to get the principal-component scores
pca = PCA()
df_plot = pd.DataFrame(pca.fit_transform(X))
# Unique cancer-type labels, and a Series view of y so observations of a
# given type can be looked up by position below
labs = df['labs'].unique().tolist()
color_idx = pd.Series(y)
# Two side-by-side scatter panels of the NCI60 PC scores, coloured by
# cancer type: (PC1, -PC2) on the left and (PC1, -PC3) on the right.
# Fixes: removed the dead `fig = go.Figure()` that make_subplots immediately
# overwrote, and hoisted the repeated row-index lookup out of each trace.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True)
for i in labs:
    # Row positions of the observations carrying this label
    rows = color_idx[color_idx == i].index.to_list()
    # Left panel (legend suppressed to avoid duplicate entries)
    fig.add_trace(go.Scatter(x = df_plot.iloc[rows, 0],
                             y = -df_plot.iloc[rows, 1],
                             mode='markers', name=i, showlegend = False), 1, 1)
    # Right panel carries the legend
    fig.add_trace(go.Scatter(x = df_plot.iloc[rows, 0],
                             y = -df_plot.iloc[rows, 2],
                             mode='markers', name=i, showlegend = True), 1, 2)
del i
# Updates: marker styling and legend frame
fig.update_traces(mode='markers', marker_line_width=1, marker_size=10)
fig.update_layout(legend=dict(traceorder="normal", bordercolor="DarkGray", borderwidth=1))
fig.update_xaxes(title_text='Principal Component 1', range = [-75, 75], tickvals=list(np.arange(-75, 100, 25)), row=1, col=1)
fig.update_yaxes(title_text='Principal Component 2', range = [-75, 75], tickvals=list(np.arange(-75, 100, 25)), row=1, col=1)
fig.update_xaxes(title_text='Principal Component 1', range = [-75, 75], tickvals=list(np.arange(-75, 100, 25)), row=1, col=2)
fig.update_yaxes(title_text='Principal Component 3', range = [-75, 75], tickvals=list(np.arange(-75, 100, 25)), row=1, col=2)
fig.update_xaxes(ticks="outside", tickwidth=2, tickcolor= 'MediumBlue', ticklen=5)
fig.update_yaxes(ticks="outside", tickwidth=2, tickcolor= 'MediumBlue', ticklen=5)
# Background: white canvas with light grey grid and mirrored axis lines
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(automargin=True)
fig.show()
Projections of the NCI60 cancer cell lines onto the first three principal components (in other words, the scores for the first three principal components). On the whole, observations belonging to a single cancer type tend to lie near each other in this low-dimensional space. It would not have been possible to visualize the data without using a dimension reduction method such as PCA, since based on the full data set there are $\binom{6{,}830}{2}$ possible scatterplots, none of which would have been particularly informative.
# 3-D scatter of the first three principal components (PC2 and PC3
# sign-flipped, matching the 2-D panels above), coloured by cancer type.
fig = go.Figure()
for i in labs:
    fig.add_trace(go.Scatter3d(x = df_plot.iloc[color_idx[color_idx == i].index.to_list(),0],
                               y = -df_plot.iloc[color_idx[color_idx == i].index.to_list(),1],
                               z = -df_plot.iloc[color_idx[color_idx == i].index.to_list(),2],
                               mode='markers', name=i, showlegend = True))
del i
# Updates: marker styling and legend frame
fig.update_traces(mode='markers', marker_line_width=1, marker_size=5)
fig.update_layout(legend=dict(traceorder="normal", bordercolor="DarkGray", borderwidth=1))
# Axis titles and a common [-100, 100] range for all three scene axes
fig.update_layout(scene = dict(xaxis_title='Principal Component 1', xaxis = dict(nticks=5, range=[-100, 100],),
                               yaxis_title='Principal Component 2', yaxis = dict(nticks=5, range=[-100, 100],),
                               zaxis_title='Principal Component 3', zaxis = dict(nticks=5, range=[-100, 100],),),)
fig.show()
# Summary of the first five principal components.
# NOTE(review): std(ddof=0) is the population standard deviation, whereas
# sklearn's explained_variance_ divides by n-1 -- the rows of this table
# therefore mix two conventions; confirm which one is intended.
pd.DataFrame([df_plot.iloc[:,:5].std(axis=0, ddof=0).values,
              pca.explained_variance_ratio_[:5],
              np.cumsum(pca.explained_variance_ratio_[:5])],
             index=['Standard Deviation', 'Proportion of Variance', 'Cumulative Proportion'],
             columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
# Delta Degrees of Freedom (ddof = 0)
Temp = df_plot.var(axis=0, ddof=0).to_frame('Variances')
Temp['Principal Components'] = np.arange(1,Temp.shape[0]+1)
#
fig = px.bar(Temp, x= 'Principal Components', y= 'Variances', text = 'Variances')
fig.update_traces(marker_color='DarkOrchid', marker_line_color='Indigo', marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_xaxes(range = [0, Temp['Principal Components'].max()], tickvals= Temp['Principal Components'].tolist())
fig['layout']['yaxis'].update(range=[0, 8e2])
fig.update_layout(plot_bgcolor= 'white')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Scree plots for NCI60: per-component PVE (left) and cumulative PVE
# (right), both in percent.
# Fixes: removed the dead `fig = go.Figure()` that make_subplots immediately
# overwrote, and replaced len(np.cumsum(...)) -- which built a whole
# cumulative-sum array just to take its length -- with the component count,
# computed once and reused.
n_components = len(pca.explained_variance_ratio_)
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Scatter(x = np.arange(1, n_components + 1),
                         y = 100*pca.explained_variance_ratio_,
                         mode='lines+markers', name = 'PVE', showlegend = False), 1, 1)
fig.add_trace(go.Scatter(x = np.arange(1, n_components + 1),
                         y = 100*np.cumsum(pca.explained_variance_ratio_),
                         mode='lines+markers', name = 'Cumulative PVE', showlegend = False), 1, 2)
# Updates: marker styling and per-panel axis titles/ranges
fig.update_traces(mode='lines+markers', marker_line_width=1, marker_size=5)
fig.update_xaxes(title_text='Principal Components', range = [0, n_components + 1],
                 tickvals=list(np.arange(0, n_components + 2, 8)), row=1, col=1)
fig.update_yaxes(title_text='PVE (Percentage)', range = [0, 12],
                 tickvals=list(np.arange(0, 13, 2)), row=1, col=1)
fig.update_xaxes(title_text='Principal Components', range = [0, n_components + 1],
                 tickvals=list(np.arange(0, n_components + 2, 8)), row=1, col=2)
fig.update_yaxes(title_text='Cumulative PVE (Percentage)',
                 range = [0, 101], tickvals=list(np.arange(0, 105, 10)), row=1, col=2)
fig.update_xaxes(ticks="outside", tickwidth=2, tickcolor= 'MediumBlue', ticklen=5)
fig.update_yaxes(ticks="outside", tickwidth=2, tickcolor= 'MediumBlue', ticklen=5)
# Background: white canvas with light grey grid and mirrored axis lines
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(automargin=True)
fig.show()
The PVE of the principal components of the NCI60 cancer cell line microarray data set.
We now proceed to hierarchically cluster the cell lines in the NCI60 data. To begin, we standardize the variables to have mean zero and standard deviation one. As mentioned earlier, this step is optional and should be performed only if we want each gene to be on the same scale.
# Re-standardise the genes, this time keeping the result as a DataFrame
# indexed by cancer type so the dendrograms below can label each leaf.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(df.drop(columns = 'labs').values), index=df['labs'].values,
                 columns=df.iloc[:,:-1].columns)
y = df['labs'].values
# Hierarchical clustering of the 64 cell lines under three linkages,
# one dendrogram per column.
fig, axes = plt.subplots(1,3, figsize=(20,20))
for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)], ['c1','c2','c3'], axes):
    # orientation='right' puts the leaves (cancer-type labels) on the y-axis
    cluster = hierarchy.dendrogram(linkage, labels=X.index, orientation='right', color_threshold=0, leaf_font_size=10, ax=ax)
axes[0].set_title('Complete Linkage')
axes[1].set_title('Average Linkage')
axes[2].set_title('Single Linkage');
The NCI60 cancer cell line microarray data, clustered with average, complete, and single linkage, and using Euclidean distance as the dissimilarity measure. Complete and average linkage tends to yield evenly sized clusters whereas single linkage tends to yield extended clusters to which single leaves are fused one by one.
# Complete-linkage dendrogram cut at height 140 (which yields four
# clusters); the dashed red line marks the cut height.
fig, ax = plt.subplots(1,1, figsize=(10,20))
_ = hierarchy.dendrogram(hierarchy.complete(X),
                         labels=X.index, orientation='right', color_threshold=140, leaf_font_size=10)
_ = ax.vlines(140,0,plt.gca().yaxis.get_data_interval()[1], colors='r', linestyles='dashed');
How do these NCI60 hierarchical clustering results compare to what we get if we perform K-means clustering with K = 4?
# K-means with K = 4 on the standardised NCI60 data, 50 random restarts;
# reseed first for reproducibility, then show the cluster sizes.
np.random.seed(2)
km = KMeans(n_clusters=4, n_init=50).fit(X)
pd.Series(km.labels_).value_counts(sort=False).to_frame('Counts').T
Observations per Hierarchical cluster
# Collapse the complete-linkage dendrogram to its last 4 merges so the leaf
# counts show how many observations fall in each of the four clusters.
_ = hierarchy.dendrogram(hierarchy.complete(X), truncate_mode='lastp', p=4, show_leaf_counts=True)
Hierarchy based on Principal Components 1 to 5
# Hierarchical clustering on the first five principal-component score
# vectors instead of the full 6,830-gene expression matrix.
fig, ax = plt.subplots(1,1, figsize=(10,20))
_ = hierarchy.dendrogram(hierarchy.complete(df_plot.iloc[:,:5]), labels=y, orientation='right',
                         color_threshold=100, leaf_font_size=10)
# Cluster sizes when the all-components dendrogram is cut into 4 clusters
_ = hierarchy.dendrogram(hierarchy.complete(df_plot), truncate_mode='lastp', p=4,
                         show_leaf_counts=True)
James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An introduction to statistical learning (Vol. 112, pp. 3-7). New York: springer.
Jordi Warmenhoven, ISLR-python